1 What Does Hive DML Do?

2 Load Comma-Delimited Data File

CREATE TABLE IF NOT EXISTS employees12 (
  name STRING COMMENT 'Employee name',
  salary FLOAT COMMENT 'Employee salary',
  FexTax FLOAT,
  StateTax FLOAT,
  Insurance FLOAT,
  address STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n';

3 Load Comma-Delimited Data Files

LOAD DATA LOCAL INPATH '/empdata-comma.txt'
OVERWRITE INTO TABLE employees12;

The LOCAL keyword:

4 Load Tab-Delimited Data File

CREATE TABLE IF NOT EXISTS employees11 (
  name STRING COMMENT 'Employee name',
  salary FLOAT COMMENT 'Employee salary',
  FedTax FLOAT,
  StateTax FLOAT,
  Insurance FLOAT,
  address STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
LINES TERMINATED BY '\n';

5 Load Tab-Delimited Data Files

LOAD DATA LOCAL INPATH '/empdata-tab.txt'
OVERWRITE INTO TABLE employees11;

6 Load Data of Complex Data Types

CREATE TABLE IF NOT EXISTS employees20 (
  name STRING COMMENT 'Employee name',
  salary FLOAT COMMENT 'Employee salary',
  subordinates ARRAY<STRING> COMMENT 'Names of subordinates',
  deductions MAP<STRING, FLOAT>
    COMMENT 'Keys are deductions names, values are percentages',
  address STRUCT<street:STRING, city:STRING, state:STRING, ZIP:INT>
    COMMENT 'Home address')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '^'
COLLECTION ITEMS TERMINATED BY '#'
MAP KEYS TERMINATED BY '@'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

7 Load Data of Complex Data Types

LOAD DATA LOCAL INPATH '/empdata.txt'
OVERWRITE INTO TABLE employees20;

8 Load Data of Complex Data Types into a Partitioned Table

CREATE TABLE IF NOT EXISTS employees23 (
  name STRING COMMENT 'Employee name',
  salary FLOAT COMMENT 'Employee salary',
  subordinates ARRAY<STRING> COMMENT 'Names of subordinates',
  deductions MAP<STRING, FLOAT>
    COMMENT 'Keys are deductions names, values are percentages',
  address STRUCT<street:STRING, city:STRING, state:STRING, ZIP:INT>
    COMMENT 'Home address')
PARTITIONED BY (country STRING, state STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '^'
COLLECTION ITEMS TERMINATED BY '#'
MAP KEYS TERMINATED BY '@'
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;

9 Load Data of Complex Data Types into a Partitioned Table (cont’d)

LOAD DATA LOCAL INPATH '/empdata-us-il.txt'
OVERWRITE INTO TABLE employees23
PARTITION (country = 'US', state = 'IL');


10 Query Partitioned Table of Complex Data Types

Select * from employees23;

11 Load Data of Complex Data Types into a Partitioned Table (cont’d)

LOAD DATA LOCAL INPATH '/empdata-us-ny.txt'
OVERWRITE INTO TABLE employees23
PARTITION (country = 'US', state = 'NY');


12 Load Data of Complex Data Types into a Partitioned Table (cont’d)

LOAD DATA LOCAL INPATH '/empdata-us-ca.txt'
OVERWRITE INTO TABLE employees23
PARTITION (country = 'US', state = 'CA');


13 Query Partitioned Table of Complex Data Types

See the whole table with three partitions:

Select * from employees23;

13.1 Using a Condition from a Partition

Select * from employees23 where state="NY";

13.2 Using a Condition from a Column (struct) field

Select * from employees23 where address.state="NY";

13.3 Using a Condition from a Column

Select name, salary, address.city, address.state
from employees23
where salary > 120000;

13.4 Using Conditions from Both a Column and a Partition

Select name, salary, address.city, address.state
from employees23
where salary > 120000 and (state = 'CA' or state = 'IL');

14 The Sub-Directories of a Partitioned Table

Those of the employees23 table:

.../employees23/country=US/state=CA
.../employees23/country=US/state=IL
.../employees23/country=US/state=NY

15 The Sub-Directories and the Files of a Partitioned Table

The location of the loaded file and its contents:

16 “Insert” Data into Tables from Queries

CREATE TABLE inserttbl like employees20;

INSERT OVERWRITE TABLE inserttbl
SELECT * FROM employees20
WHERE salary > 100000.0;

SELECT * FROM inserttbl;

17 Insert Data into Tables from Queries

INSERT OVERWRITE TABLE employees
PARTITION (country = 'US', state = 'OR')
SELECT * FROM staged_employees se
WHERE se.cnty = 'US' AND se.st = 'OR';
INSERT INTO TABLE employees
PARTITION (country = 'US', state = 'OR')
SELECT * FROM staged_employees se
WHERE se.cnty = 'US' AND se.st = 'OR';

18 An INSERT Syntax: Scan the Data Once and Split it Multiple Ways

FROM staged_employees se
INSERT OVERWRITE TABLE employees
  PARTITION (country = 'US', state = 'OR')
  SELECT * WHERE se.cnty = 'US' AND se.st = 'OR'
INSERT OVERWRITE TABLE employees
  PARTITION (country = 'US', state = 'CA')
  SELECT * WHERE se.cnty = 'US' AND se.st = 'CA'
INSERT OVERWRITE TABLE employees
  PARTITION (country = 'US', state = 'IL')
  SELECT * WHERE se.cnty = 'US' AND se.st = 'IL';

19 Dynamic Partition Inserts

Hive’s dynamic partition can infer the partitions to create based on query parameters:

INSERT OVERWRITE TABLE employees
PARTITION (country, state)
SELECT ..., se.cnty, se.st
FROM staged_employees se;

In this example, Hive determines the values of the partition keys, country and state, from the last two columns in the SELECT clause

Different names in staged_employees emphasize that the relationship between the source column values and the output partition values is by position only and not by matching on names

20 Mix Dynamic and Static Partitions

Static value for the country (US) and a dynamic value for the state. The static partition keys must come before the dynamic partition keys

INSERT OVERWRITE TABLE employees
PARTITION (country = 'US', state)
SELECT ..., se.cnty, se.st
FROM staged_employees se
WHERE se.cnty = 'US';

21 Dynamic Partitions Properties

Name Default Description
hive.exec.dynamic.partition false Set to true to enable dynamic partitioning
hive.exec.dynamic.partition.mode strict Set to nonstrict to enable all partitions to be determined dynamically
hive.exec.max.dynamic.partitions.pernode 100 The maximum number of dynamic partitions that can be created by each mapper or reducer. Raises a fatal error if one mapper or reducer attempts to create more than the threshold
hive.exec.max.dynamic.partitions +1000 The total number of dynamic partitions that can be created by one statement with dynamic partitioning. Raises a fatal error if the limit is exceeded
hive.exec.max.created.files 100000 The maximum total number of files that can be created globally. A Hadoop counter is used to track the number of files created. Raises a fatal error if the limit is exceeded

22 Set the Desired Properties Just Before Use

hive> set hive.exec.dynamic.partition=true;
hive> set hive.exec.dynamic.partition.mode=nonstrict;
hive> set hive.exec.max.dynamic.partitions.pernode=1000;
hive> INSERT OVERWRITE TABLE employees
    > PARTITION (country, state)
    > SELECT ..., se.cty, se.st
    > FROM staged_employees se;

23 Create Tables and Load Them in One Query

CREATE TABLE ca_employees
AS SELECT name, salary, address
FROM employees
WHERE state = 'CA';

24 Export Data

INSERT OVERWRITE LOCAL DIRECTORY '/tmp/ca_employees'
SELECT name, salary, address
FROM employees
WHERE state = 'CA';

Independent of how the data is actually stored in the source table, it is written to files with all fields serialized as strings

hive> ! ls /tmp/ca_employees;
000000_0
hive> ! cat /tmp/ca_employees/000000_0
John Doe100000.0201 San Antonio CircleMountain ViewCA94040
Mary Smith80000.01 Infinity LoopCupertinoCA95014
...